

* Positional arguments of this do-file, in order:
*   tracks_to_match     dataset of tracks needing a match
*                       e.g. "C:\Users\rklotz\Dropbox\Boats\Stata\results\CAtracks_wFuel\vessels_to_match"
*   mmsi_imo_pairs_csv  csv file holding the vessel list
*                       e.g. "C:\Users\rklotz\Dropbox\Boats\Stata\CA_tracks\vessel_list.csv"
*   mmsi_imo_my         name of the dataset saved at the end of the script
args tracks_to_match mmsi_imo_pairs_csv mmsi_imo_my

* read the vessel list csv, replacing whatever is in memory
import delimited "`mmsi_imo_pairs_csv'" , clear

* cleaning imo number
* A usable IMO number has exactly 7 digits. Some records carry one or two
* extra trailing zeros (8 or 9 digits in total): strip those zeros, then blank
* out whatever still is not 7 digits long.
* The digit ranges are tested on imo itself with inclusive lower bounds, so the
* exact powers of ten (10000000 and 100000000) are treated like every other
* 8- or 9-digit value instead of slipping between strict inequalities on a
* float-precision ratio.

* fewer than 7 digits
replace imo = . if imo < 1e6

* 8 digits ending in 0: remove the trailing zero
replace imo = imo/10 if imo >= 1e7 & imo < 1e8 & mod(imo,10)==0
* 9 digits ending in 00: remove both trailing zeros
replace imo = imo/100 if imo >= 1e8 & imo < 1e9 & mod(imo,100)==0

* anything that still has more than 7 digits
replace imo = . if imo >= 1e7 & !missing(imo)

*replace length=. if length==0
*replace width=. if width==0

* string version of mmsi (easier to slice than a long) and its first 3 digits
tostring mmsi, generate(mmsi_str) format(%10.0f)
gen mid = real(substr(mmsi_str, 1, 3)) // get country id

* categorize vessels: collapse the detailed type code into broad classes
* (the code sets below do not overlap, so the order of the replaces is free;
*  codes not listed, including missing, stay "Other")
rename vesseltype vesselType_detailed
local code vesselType_detailed
gen str vessel_str = "Other"
replace vessel_str = "NA"        if `code' == 0
replace vessel_str = "Tug"       if inlist(`code',21,22,31,32,52,1023,1025)
replace vessel_str = "Fishing"   if inlist(`code',30,1001,1002)
replace vessel_str = "Military"  if inlist(`code',35,1021)
replace vessel_str = "Pleasure"  if inlist(`code',36,37,1019)
replace vessel_str = "Passenger" if inrange(`code',60,69) | inrange(`code',1012,1015)
replace vessel_str = "Cargo"     if inrange(`code',70,79) | inlist(`code',1003,1004)
replace vessel_str = "Tanker"    if inrange(`code',80,89) | inlist(`code',1017,1024)
drop `code'

* flags telling vessel-list rows apart from rows appended later
gen mmsi_scrambled = 0
gen from_vessel_list = 1

* dropping tug, fishing, military and pleasure vessels
drop if inlist(vessel_str, "Tug", "Fishing", "Military", "Pleasure")

* the broad vessel class is coarser than the detailed code, so rows can now
* coincide; keep one of each
duplicates drop mmsi imo length width vessel_str , force

* save the cleaned vessel list
save vessel_list , replace

* load tracks needing a match ************
* the path is quoted, like the csv path in the import above, so that a
* location containing spaces does not break the command
append using "`tracks_to_match'"

* updating vessel list variable
* (rows that came from the appended file have no value for the flag yet)
replace from_vessel_list = 0 if from_vessel_list==.

* SELECT WHICH SAMPLE OF MATCHES TO USE
*keep if mmsi_scrambled==0 & from_vessel_list==0

* creating vesseltype variable: numeric, value-labelled version of vessel_str
encode vessel_str, gen(vesseltype) 
label list vesseltype

* checking IMO codes for validity
* digits 1-6 are weighted 7,6,5,4,3,2; the last digit of the weighted sum
* has to equal digit 7
gen imo_txt = string(imo)
gen imo_digit7 = real(substr(imo_txt, 7, 1))

gen imo_wsum = 0
forvalues pos = 1/6 {
	local weight = 8 - `pos'
	replace imo_wsum = imo_wsum + real(substr(imo_txt, `pos', 1)) * `weight'
}
gen imo_wsum_last = mod(imo_wsum, 10)

* only evaluated for unscrambled rows; left missing otherwise
gen imo_ok = imo_wsum_last == imo_digit7 if mmsi_scrambled==0
drop imo_txt imo_digit7 imo_wsum imo_wsum_last

* unscrambled rows are removed when the check digit fails or when the
* imo is zero or missing
drop if mmsi_scrambled==0 & ( imo_ok==0 | imo==0 | imo==. )

* remove duplicates among unscrambled rows (vessel list and tracks list)
* a small number sit in the tracks list but not the vessel list, which is surprising
* the cross-tab is shown before and after to see how many rows go
tab from_vessel_list mmsi_scrambled
duplicates drop mmsi imo length width vesseltype if mmsi_scrambled==0 , force
tab from_vessel_list mmsi_scrambled

* characters 4-9 of the mmsi string, as text and as a number
gen mmsi6 = substr(mmsi_str, 4, 6)
gen mmsi6_float = real(mmsi6)

* n0 ... n9: how often each digit occurs in mmsi6
* (length of the string minus its length once that digit is removed)
forvalues digit = 0/9 {
	gen n`digit' = strlen(mmsi6) - strlen(subinstr(mmsi6, "`digit'", "", .))
}

* row identifier used to merge the matches back in later
gen id = _n

* save two renamed copies of the data to merge back in later
* (every variable gets suffix _m in the first copy and _m2 in the second;
*  the suffixed id becomes the merge key id_match1 / id_match2)
preserve
foreach var of varlist _all {
	rename `var' `var'_m
}
rename id_m id_match1 
save "mmsi_tmp", replace
restore

preserve
foreach var of varlist _all {
	rename `var' `var'_m2
}
* in this copy the key is called id_m2; name it in full rather than relying
* on variable-name abbreviation of id_m, which fails with varabbrev off
rename id_m2 id_match2 
save "mmsi_tmp2", replace
restore

* nearest-neighbour matching of scrambled rows (treatment = mmsi_scrambled)
* on length and width, with exact matching on vessel type, the digit counts
* and mid
* NOTE(review): n0 is left out of the exact-match list; presumably it is
* implied by n1-n9 when mmsi6 has a fixed length - confirm
local exactmatch_list vesseltype n1 n2 n3 n4 n5 n6 n7 n8 n9 mid

* first pass: only used to flag rows without an exact match in the other
* group (osample); an error from this call is deliberately ignored
capture teffects nnmatch (mmsi6_float length width) (mmsi_scrambled)  ///
	, nn(1) osample(nomatch) ematch( `exactmatch_list' ) metric(euclidean)

* if the first pass left no nomatch variable behind (e.g. no row violated
* the overlap requirement), treat every row as matchable so the second pass
* does not stop on an unknown variable
capture confirm variable nomatch, exact
if _rc {
	gen byte nomatch = 0
}

* second pass on the matchable rows; gen() stores the observation numbers of
* the neighbours in id_match1, id_match2, ...
teffects nnmatch (mmsi6_float length width) (mmsi_scrambled) if nomatch==0 ///
  , nn(1) gen(id_match) ematch( `exactmatch_list' ) vce(iid) metric(euclidean)
drop n0-n9

* with nn(1) a second neighbour variable only exists when some row has tied
* nearest neighbours; create it empty otherwise so the steps below still run
capture confirm variable id_match2, exact
if _rc {
	gen id_match2 = .
}

* matches are only wanted for the scrambled rows
replace id_match1=. if mmsi_scrambled==0
replace id_match2=. if mmsi_scrambled==0

* bring in mmsi, size and imo of the first matched row
* (rows that exist only in the using file are not kept, and no _merge
*  variable is left behind)
merge m:1 id_match1 using "mmsi_tmp" , keepusing(mmsi_str_m length_m width_m imo_m) ///
	keep(master match) nogenerate

* same for the second matched row
merge m:1 id_match2 using "mmsi_tmp2" , keepusing(mmsi_str_m2 length_m2 width_m2 imo_m2) ///
	keep(master match) nogenerate

* from here on only the scrambled rows are needed
drop if mmsi_scrambled==0

* euclidean distance in (length, width) between each row and its matches
gen match_dist = ( (length_m - length)^2 + (width_m - width)^2 )^0.5
gen match_dist2 = ( (length_m2 - length)^2 + (width_m2 - width)^2 )^0.5

* number of matches found for the row (0, 1 or 2)
egen n_matches = rownonmiss(id_match1-id_match2)

* do the two matched rows carry the same imo?
gen imo_same = imo_m == imo_m2
drop *_m2

* a match counts as successful when the first match is closer than 10 and
* there is either a single match or two matches sharing one imo
gen match_success = (match_dist < 10) & ( n_matches==1 | (n_matches==2 & imo_same==1) )

* displaying statistics
tab n_matches vesseltype
tab match_success vesseltype

* keep the successful matches only
keep if match_success==1

* drop imo-mmsi duplicates
* this gets rid of duplicate mmsi that match to the same imo
duplicates drop mmsi imo_m , force

* now look at mmsi that joined to different imo
* (dup = number of additional rows with the same mmsi)
duplicates tag mmsi  , gen(dup)

* calculate mean absolute deviation from the mean within each mmsi,
* trying to determine whether the vessel sizes are very different
*keep if dup>0
egen l_mdev = mdev(length) if dup>0 , by(mmsi)
egen w_mdev = mdev(width) if dup>0 , by(mmsi)
gen lw_mdev = l_mdev + w_mdev if dup>0
* NOTE(review): my is not created in this file; presumably a date variable
* that comes with the tracks data - confirm
egen my_mdev = mdev(my) if dup>0 , by(mmsi)

* checking if different vessels: 1 when the mmsi has more than one vessel type
by mmsi (vesseltype), sort: gen vesdiff = vesseltype[1] != vesseltype[_N]

* dropping if there is very little difference in size between the duplicate
* imo matches, or if they are within one year
* (all inputs are constant within an mmsi, so every row of a flagged mmsi goes)
gen dup_drop = ( lw_mdev<4 | my_mdev<6 ) & vesdiff==0 if dup>0

drop if dup_drop==1
drop dup *_mdev dup_drop



keep mmsi imo_m my mmsi_str_m // low high

* the output path is quoted so that a location containing spaces works
save "`mmsi_imo_my'" , replace 

